In [5]:
import pandas as pd
data = pd.read_csv('Accidental_Drug_Related_Deaths_2012-2023.csv')
data.head()
Out[5]:
| Date | Date Type | Age | Sex | Race | Ethnicity | Residence City | Residence County | Residence State | Injury City | ... | Xylazine | Gabapentin | Opiate NOS | Heroin/Morph/Codeine | Other Opioid | Any Opioid | Other | ResidenceCityGeo | InjuryCityGeo | DeathCityGeo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 05/29/2012 | Date of death | 37.0 | Male | Black | NaN | STAMFORD | FAIRFIELD | NaN | STAMFORD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | STAMFORD, CT\n(41.051924, -73.539475) | STAMFORD, CT\n(41.051924, -73.539475) | CT\n(41.575155, -72.738288) |
| 1 | 06/27/2012 | Date of death | 37.0 | Male | White | NaN | NORWICH | NEW LONDON | NaN | NORWICH | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NORWICH, CT\n(41.524304, -72.075821) | NORWICH, CT\n(41.524304, -72.075821) | Norwich, CT\n(41.524304, -72.075821) |
| 2 | 03/24/2014 | Date of death | 28.0 | Male | White | NaN | HEBRON | NaN | NaN | HEBRON | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | HEBRON, CT\n(41.658069, -72.366324) | HEBRON, CT\n(41.658069, -72.366324) | Marlborough, CT\n(41.632043, -72.461309) |
| 3 | 12/31/2014 | Date of death | 26.0 | Female | White | NaN | BALTIC | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | BALTIC, CT\n(41.617221, -72.085031) | CT\n(41.575155, -72.738288) | Baltic, CT\n(41.617221, -72.085031) |
| 4 | 01/16/2016 | Date of death | 41.0 | Male | White | NaN | SHELTON | FAIRFIELD | CT | SHELTON | ... | NaN | NaN | NaN | NaN | NaN | Y | NaN | SHELTON, CT\n(41.316843, -73.092968) | SHELTON, CT\n(41.316843, -73.092968) | Bridgeport, CT\n(41.179195, -73.189476) |
5 rows × 48 columns
In [6]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Load the dataset
data = pd.read_csv('Accidental_Drug_Related_Deaths_2012-2023.csv')
# Convert 'Sex' column: Male -> 0, Female -> 1
data['Sex'] = data['Sex'].map({'Male': 0, 'Female': 1})
# Convert 'Any Opioid' column: Replace NaN/Null with 'No', and 'Yes' stays as is
data['Any Opioid'] = data['Any Opioid'].fillna('No') # Replace NaN with 'No'
# Map 'Yes' to 1 and 'No' to 0 for 'Any Opioid'
data['Any Opioid'] = data['Any Opioid'].map({'Y': 1, 'No': 0})
# Show the unique values of 'Any Opioid'
print(data['Any Opioid'].value_counts())
# Prepare the features and target
X = data[['Age', 'Sex']] # Features (Age and Sex)
y = data['Any Opioid'] # Target (Any Opioid)
# Before proceeding, ensure 'Any Opioid' doesn't have NaN values
print("\nBefore handling NaN in 'Any Opioid':")
print(y.isna().sum()) # Display the number of NaN values in the target column
# Handle NaN values in the 'Any Opioid' column (Replace NaN with 0 or 'No')
y = y.fillna(0) # Fill NaN with 0 to indicate 'No'
# Verify that NaN values were handled correctly
print("\nAfter handling NaN in 'Any Opioid':")
print(y.isna().sum()) # Ensure there are no NaN values left in the target column
# Split the dataset into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Record the start time for measuring execution time
start_time = time.time()
# Initialize the Random Forest Classifier with 50 estimators (trees)
clf = RandomForestClassifier(n_estimators=50)
# Train the model using the training data
clf.fit(X_train, y_train)
# Make predictions on the test data
y_pred = clf.predict(X_test)
# Record the end time to calculate the execution time
end_time = time.time()
# Calculate the execution time
exe_time = end_time - start_time
print(f"\nExecution time without parallelization: {exe_time} seconds")
# Evaluate the model performance
CM = confusion_matrix(y_pred, y_test) # Confusion Matrix
print("\nConfusion Matrix:")
print(CM)
AS = accuracy_score(y_pred, y_test) # Accuracy Score
print(f"\nAccuracy Score: {AS}")
CR = classification_report(y_pred, y_test) # Classification Report (precision, recall, f1-score)
print("\nClassification Report:")
print(CR)
Any Opioid
1.0 8828
0.0 3034
Name: count, dtype: int64
Before handling NaN in 'Any Opioid':
119
After handling NaN in 'Any Opioid':
0
Execution time without parallelization: 0.28926849365234375 seconds
Confusion Matrix:
[[ 4 2]
[ 978 2611]]
Accuracy Score: 0.7273991655076495
Classification Report:
precision recall f1-score support
0.0 0.00 0.67 0.01 6
1.0 1.00 0.73 0.84 3589
accuracy 0.73 3595
macro avg 0.50 0.70 0.43 3595
weighted avg 1.00 0.73 0.84 3595
In [7]:
pip install graphviz
Requirement already satisfied: graphviz in c:\users\sanja\downloads\anaconda\lib\site-packages (0.20.3) Note: you may need to restart the kernel to use updated packages.
In [8]:
from sklearn.tree import export_graphviz
import graphviz
In [9]:
rf=RandomForestClassifier(n_estimators=1)
rf.fit(X_train,y_train)
Out[9]:
RandomForestClassifier(n_estimators=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [10]:
len(rf.estimators_)
Out[10]:
1
In [12]:
from sklearn import tree
import matplotlib.pyplot as plt
X = data[['Age', 'Sex']] # Features (Age and Sex)
y = data['Any Opioid']
plt.figure(figsize=(45,35))
_=tree.plot_tree(rf.estimators_[0], filled=True)
In [13]:
%matplotlib inline
plt.figure(figsize=(25,15)) # Increase figure size
tree.plot_tree(rf.estimators_[0], filled=True, feature_names=X.columns, rounded=True, fontsize=12)
plt.show()
In [18]:
import pandas as pd
import time
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Initialize Dask client for parallelization
client = Client(n_workers=6)
# Load the dataset
data = pd.read_csv('Accidental_Drug_Related_Deaths_2012-2023.csv')
# Convert 'Sex' column: Male -> 0, Female -> 1
data['Sex'] = data['Sex'].map({'Male': 0, 'Female': 1})
# Convert 'Any Opioid' column: Replace NaN/Null with 'No', and 'Yes' stays as is
data['Any Opioid'] = data['Any Opioid'].fillna('No') # Replace NaN with 'No'
# Map 'Yes' to 1 and 'No' to 0 for 'Any Opioid'
data['Any Opioid'] = data['Any Opioid'].map({'Y': 1, 'No': 0})
# Show the unique values of 'Any Opioid'
print(data['Any Opioid'].value_counts())
# Prepare the features and target
X = data[['Age', 'Sex']] # Features (Age and Sex)
y = data['Any Opioid'] # Target (Any Opioid)
# Before proceeding, ensure 'Any Opioid' doesn't have NaN values
print("\nBefore handling NaN in 'Any Opioid':")
print(y.isna().sum()) # Display the number of NaN values in the target column
# Handle NaN values in the 'Any Opioid' column (Replace NaN with 0 or 'No')
y = y.fillna(0) # Fill NaN with 0 to indicate 'No'
# Verify that NaN values were handled correctly
print("\nAfter handling NaN in 'Any Opioid':")
print(y.isna().sum()) # Ensure there are no NaN values left in the target column
# Split the dataset into training and testing sets (70% training, 30% testing) using Dask
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
# Record the start time for measuring execution time
start_time = time.time()
# Initialize the Random Forest Classifier with 50 estimators (trees)
clf = RandomForestClassifier(n_estimators=50)
# Train the model using the training data
clf.fit(X_train, y_train)
# Make predictions on the test data
y_pred = clf.predict(X_test)
# Record the end time to calculate the execution time
end_time = time.time()
# Calculate the execution time
exe_time = end_time - start_time
print(f"\nExecution time with Dask parallelization: {exe_time} seconds")
# Evaluate the model performance
CM = confusion_matrix(y_pred, y_test) # Confusion Matrix
print("\nConfusion Matrix:")
print(CM)
AS = accuracy_score(y_pred, y_test) # Accuracy Score
print(f"\nAccuracy Score: {AS}")
CR = classification_report(y_pred, y_test) # Classification Report (precision, recall, f1-score)
print("\nClassification Report:")
print(CR)
# Close the Dask client
client.close()
C:\Users\sanja\Downloads\anaconda\Lib\site-packages\distributed\node.py:187: UserWarning: Port 8787 is already in use. Perhaps you already have a cluster running? Hosting the HTTP server on port 59682 instead warnings.warn( C:\Users\sanja\Downloads\anaconda\Lib\contextlib.py:144: UserWarning: Creating scratch directories is taking a surprisingly long time. (1.46s) This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk. next(self.gen)
Any Opioid
1.0 8828
0.0 3034
Name: count, dtype: int64
Before handling NaN in 'Any Opioid':
119
After handling NaN in 'Any Opioid':
0
Execution time with Dask parallelization: 0.8758988380432129 seconds
Confusion Matrix:
[[ 7 7]
[ 975 2606]]
Accuracy Score: 0.7268428372739917
Classification Report:
precision recall f1-score support
0.0 0.01 0.50 0.01 14
1.0 1.00 0.73 0.84 3581
accuracy 0.73 3595
macro avg 0.50 0.61 0.43 3595
weighted avg 0.99 0.73 0.84 3595
In [19]:
from sklearn.tree import export_graphviz
import graphviz
In [20]:
rf=RandomForestClassifier(n_estimators=1)
rf.fit(X_train,y_train)
Out[20]:
RandomForestClassifier(n_estimators=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [21]:
len(rf.estimators_)
Out[21]:
1
In [22]:
from sklearn import tree
X = data[['Age', 'Sex']] # Features (Age and Sex)
y = data['Any Opioid']
plt.figure(figsize=(45,35))
_=tree.plot_tree(rf.estimators_[0], filled=True)
In [23]:
%matplotlib inline
plt.figure(figsize=(25,15)) # Increase figure size
tree.plot_tree(rf.estimators_[0], filled=True, feature_names=X.columns, rounded=True, fontsize=12)
plt.show()
In [ ]: